Loading Libraries

library(magrittr)
library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
library(ggplot2)
library(plotly)
library(scales)
library(gridExtra)
library(viridis)
library(MASS)
library(tigris)
library(acs)
library(leaflet)
library(tibble)
library(htmlwidgets)
library(data.table)

Reading Data

# police.killings.df <- read.csv('police_killings_2015.csv', header = TRUE)
# police.killings.df <- police.killings.df %>%
#                         mutate(age_n = as.numeric(age),
#                                age_f = case_when(
#                                  age_n < 18 ~ 'Minor',
#                                  age_n < 30 ~ 'Age 18 - 30',
#                                  age_n < 50 ~ 'Age 30 - 50',
#                                  age_n < 70 ~ 'Age 50 - 70',
#                                  age_n >= 70 ~ 'Senior Citizen'
#                                ),
#                                age_f = factor(age_f, levels = c('Minor', 'Age 18 - 30', 'Age 30 - 50', 'Age 50 - 70', 'Senior Citizen')),
#                                raceethnicity =  factor(raceethnicity, levels = c('White', 'Black', 'Hispanic/Latino', 'Asian/Pacific Islander', 'Native American', 'Unknown')),
#                                month = factor(month, levels = c('January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December')))
# 
# police.killings.df <- na.omit(police.killings.df)
# 
# police.killings.df$census_code <- apply(police.killings.df, 1, function(row) call_geolocator_latlon(row['latitude'], row['longitude']))
# 
# str(police.killings.df)
# police.killings.df$tract <- substr(police.killings.df$census_code, 6, 11)
# 
# police.killings.df$tract <- as.numeric(police.killings.df$tract)
# 
# police.killings.df$code <- substr(police.killings.df$census_code, 1, 11)
# 
# police.killings.df$code <- as.numeric(police.killings.df$code)
census.df <- read.csv('https://s3.amazonaws.com/eda-proj-2019/PDB_2015_Tract.csv') %>%
  dplyr::select('State', 'County', 'Tract', 'Tot_Population_CEN_2010', 'pct_Hispanic_CEN_2010', 'pct_NH_White_alone_CEN_2010', 'pct_NH_Blk_alone_CEN_2010', 'pct_NH_AIAN_alone_CEN_2010', 'pct_NH_Asian_alone_CEN_2010', 'pct_NH_NHOPI_alone_CEN_2010', 'pct_College_ACS_09_13', 'pct_Prs_Blw_Pov_Lev_ACS_09_13', 'pct_Civ_unemp_16p_ACS_09_13', 'Aggr_House_Value_ACS_09_13')

str(census.df)
## 'data.frame':    74021 obs. of  14 variables:
##  $ State                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ County                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Tract                        : int  20100 20200 20300 20400 20500 20600 20700 20801 20802 20900 ...
##  $ Tot_Population_CEN_2010      : int  1912 2170 3373 4386 10766 3668 2891 3081 10435 5675 ...
##  $ pct_Hispanic_CEN_2010        : num  2.3 3.46 2.58 1.94 3.3 4.8 3.39 1.85 1.52 1.69 ...
##  $ pct_NH_White_alone_CEN_2010  : num  83.7 38.9 75.2 91.9 78.4 ...
##  $ pct_NH_Blk_alone_CEN_2010    : num  11.35 55.94 19.18 4.35 13.17 ...
##  $ pct_NH_AIAN_alone_CEN_2010   : num  0.68 0.23 0.27 0.25 0.41 0.27 0.24 0.81 0.45 0.25 ...
##  $ pct_NH_Asian_alone_CEN_2010  : num  0.73 0.23 0.5 0.41 2.74 0.16 0.45 0.52 0.57 0.33 ...
##  $ pct_NH_NHOPI_alone_CEN_2010  : num  0 0 0.15 0.07 0.06 0 0.03 0 0.07 0 ...
##  $ pct_College_ACS_09_13        : num  26.7 20.7 17.4 24.1 39.2 ...
##  $ pct_Prs_Blw_Pov_Lev_ACS_09_13: num  10.52 15.38 13.9 2.95 7.8 ...
##  $ pct_Civ_unemp_16p_ACS_09_13  : num  2.79 15.3 3.58 11.51 5.28 ...
##  $ Aggr_House_Value_ACS_09_13   : Factor w/ 70290 levels "","$1,000,099,000",..: 64410 67805 7117 23971 50221 12483 63408 21129 49207 25207 ...
census2.df <- census.df %>%
  mutate(state2 = str_pad(State, 2, 'left', 0),
         county2 = str_pad(County, 3, 'left', 0),
         tract2 = str_pad(Tract, 6, 'left', 0),
         code = as.numeric(paste0(state2, county2, tract2)),
         tot_pop = Tot_Population_CEN_2010,
         share_hispanic = pct_Hispanic_CEN_2010,
         share_White = pct_NH_White_alone_CEN_2010,
         share_black = pct_NH_Blk_alone_CEN_2010,
         share_native = pct_NH_AIAN_alone_CEN_2010,
         share_asian = pct_NH_Asian_alone_CEN_2010 + pct_NH_NHOPI_alone_CEN_2010,
         share_unknown = 100 - (share_hispanic + share_White + share_black + share_native + share_asian),
         college = pct_College_ACS_09_13,
         below_poverty = pct_Prs_Blw_Pov_Lev_ACS_09_13,
         unemp = pct_Civ_unemp_16p_ACS_09_13,
         avg_house_value = as.integer(Aggr_House_Value_ACS_09_13)) %>%
    dplyr::select(county2, code, tot_pop, share_White, share_black, share_hispanic, share_native, share_asian, share_unknown, college, below_poverty, unemp, avg_house_value)
# 
# 
# police.killings2.df <- left_join(police.killings.df, census2.df)
# 
# str(police.killings2.df)
# 
# write.csv(police.killings2.df, 'police_killing_combined.csv')
police.killings2.df <- read.csv('police_killing_combined.csv', header = TRUE)
police.killings.df <- police.killings2.df

Univariate Analysis

police.killings.df %>%
  mutate(total = n()) %>%
    group_by(age_f) %>%
      mutate(perc = n()/total) %>%
    ungroup() %>%
      dplyr :: select(age_f, perc) %>%
        group_by(age_f) %>%
          summarise(frac = round(mean(perc), 2)) %>%
            ggplot(aes(x = age_f, y = frac * 100)) +
              geom_histogram(stat = 'identity', fill = '#E69F00', alpha = 0.8) +
                xlab('Age Groups') +
                  ylab('Percenatge') +
                    ggtitle('Police Killings by Age Groups') +
                      theme_bw() +
                        theme(text = element_text(size = 16),
                              axis.text = element_text(size = 14))

police.killings.df %>%
  mutate(total = n()) %>%
    group_by(gender) %>%
      mutate(perc = n()/total) %>%
    ungroup() %>%
      dplyr :: select(gender, perc) %>%
        group_by(gender) %>%
          summarise(frac = round(mean(perc), 2)) %>%
            ggplot(aes(x = gender, y = frac * 100)) +
              geom_histogram(stat = 'identity', fill = '#56B4E9', alpha = 0.8) +
                xlab('Gender') +
                  ylab('Percenatge') +
                    ggtitle('Police Killings by Gender') +
                      theme_bw() +
                        theme(text = element_text(size = 16),
                          axis.text = element_text(size = 14))

police.killings.df %>%
  mutate(total = n()) %>%
    group_by(raceethnicity) %>%
      mutate(perc = n()/total) %>%
    ungroup() %>%
      dplyr :: select(raceethnicity, perc) %>%
        group_by(raceethnicity) %>%
          summarise(frac = round(mean(perc), 2)) %>%
            ggplot(aes(x = raceethnicity, y = frac * 100)) +
              geom_histogram(stat = 'identity', fill = '#009E73', alpha = 0.8) +
                xlab('Race Ethnicity') +
                  ylab('Percentage') +
                    ggtitle('Police Killings by Race Ethnicity') +
                      theme_bw() +
                      theme(text = element_text(size = 16),
                          axis.text = element_text(size = 14),
                          axis.text.x = element_text(angle = 30, hjust = 1))

police.killings.df %>%
  mutate(total = n()) %>%
    group_by(armed) %>%
      mutate(perc = n()/total) %>%
    ungroup() %>%
      dplyr :: select(armed, perc) %>%
        group_by(armed) %>%
          summarise(frac = round(mean(perc), 2)) %>%
            ggplot(aes(x = armed, y = frac * 100)) +
              geom_histogram(stat = 'identity', fill = '#F0E442', alpha = 0.8) +
                xlab('Armed Info') +
                  ylab('Percentage') +
                    ggtitle('Police Killings by Armed Info') +
                      theme_bw() +
                        theme(text = element_text(size = 16),
                            axis.text = element_text(size = 14),
                            axis.text.x = element_text(angle = 30, hjust = 1))

police.killings.df %>%
  mutate(total = n()) %>%
    group_by(classification) %>%
      mutate(perc = n()/total) %>%
    ungroup() %>%
      dplyr :: select(classification, perc) %>%
        group_by(classification) %>%
          summarise(frac = round(mean(perc), 2)) %>%
            ggplot(aes(x = classification, y = frac * 100)) +
              geom_histogram(stat = 'identity', fill = '#0072B2', alpha = 0.8) +
                xlab('Cause of Death') +
                  ylab('Percentage') +
                    ggtitle('Police Killings by Cause of Death') +
                      theme_bw() +
                        theme(text = element_text(size = 16),
                            axis.text = element_text(size = 14),
                            axis.text.x = element_text(angle = 30, hjust = 1))

police.killings.df %>%
  mutate(total = n()) %>%
    group_by(month) %>%
      mutate(perc = n()/total) %>%
    ungroup() %>%
      dplyr :: select(month, perc) %>%
        group_by(month) %>%
          summarise(frac = round(mean(perc), 2)) %>%
            ggplot(aes(x = month, y = frac * 100)) +
              geom_histogram(stat = 'identity', fill = '#D55E00', alpha = 0.8) +
                xlab('Month') +
                  ylab('Percenatge') +
                    ggtitle('Police Killings by Month') +
                      theme_bw() +
                        theme(text = element_text(size = 16),
                            axis.text = element_text(size = 14),
                            axis.text.x = element_text(angle = 30, hjust = 1))

Bivariate Analysis

police.killings.df %>% 
  filter (armed != 'Disputed') %>%
  mutate(total = n(),
         age_f2 = case_when (
           age_f == 'Minor' ~ 'Minor',
           age_f == 'Age 18 - 30' ~ 'Age 18 - 30',
           age_f == 'Age 30 - 50' ~ 'Age > 30',
           age_f == 'Age 50 - 70' ~ 'Age > 30'
         ),
         age_f2 = factor(age_f2, levels = c('Minor', 'Age 18 - 30', 'Age > 30'))) %>%
    group_by(age_f2, armed) %>%
      mutate(perc = n()/total) %>%
    ungroup() %>%
      dplyr :: select(age_f2, armed, perc) %>%
        group_by(age_f2, armed) %>%
          summarise(frac = round(mean(perc), 2)) %>%
            ggplot(aes(x = armed, y = frac * 100, fill = armed)) +
              geom_histogram(stat = 'identity', alpha = 0.8) +
                facet_wrap( ~age_f2, ncol = 1) +
                xlab('Armed Info') +
                  ylab('Percenatge') +
                    ggtitle('Police Killings by Armed Info given Age Groups') +
                      theme_bw() +
                        theme(text = element_text(size = 16),
                              axis.text = element_text(size = 14),
                              axis.text.x = element_text(angle = 30, hjust = 1)) +
                            scale_fill_viridis_d()

police.killings.df %>%
  filter (armed != 'Disputed') %>%
  group_by(raceethnicity) %>%
    mutate(total = n()) %>%
  ungroup() %>%
    group_by(raceethnicity, armed) %>%
      mutate(perc = n()/total) %>%
    ungroup() %>%
      dplyr :: select(raceethnicity, armed, perc) %>%
        group_by(raceethnicity, armed) %>%
          summarise(frac = round(mean(perc), 2)) %>%
            ggplot(aes(x = armed, y = frac * 100, fill = armed)) +
              geom_histogram(stat = 'identity', alpha = 0.8) +
                facet_wrap( ~raceethnicity, ncol = 2) +
                xlab('Armed Info') +
                  ylab('Percenatge') +
                    ggtitle('Police Killings by Armed Info given Race ethnicity') +
                    labs(subtitle = 'Normalized by Race') +
                      theme_bw() +
                        theme(text = element_text(size = 16),
                              axis.text = element_text(size = 14),
                              axis.text.x = element_text(angle = 30, hjust = 1)) +
                          scale_fill_viridis_d()

police.killings.df %>%
  mutate(total = n()) %>%
    group_by(day, month) %>%
      mutate(count = n()) %>%
    ungroup() %>%
      dplyr :: select(day, month, count) %>%
        group_by(day, month) %>%
          summarise(total_count = round(mean(count), 2)) %>%
            ggplot(aes(x = day, y = total_count)) +
              geom_point() +
                geom_smooth(method = 'loess', method.args = list(degree = 1), se = FALSE) +
                facet_wrap( ~month) +
                xlab('Day') +
                  ylab('Count') +
                    ggtitle('Police Killings by Day for each Months') +
                      theme_bw() +
                        theme(text = element_text(size = 16),
                              axis.text = element_text(size = 14),
                              axis.text.x = element_text(angle = 30, hjust = 1))

Univariate Analysis

racial.killings.df <- police.killings.df %>%
                       rowid_to_column() %>%
                        dplyr::select(rowid, code, raceethnicity) %>%
                          group_by(code, raceethnicity) %>%
                            mutate(killing_count = n()) %>%
                          ungroup() %>%
                            spread(raceethnicity, killing_count)

racial.killings.df[is.na(racial.killings.df)] <- 0

                          
racial.killings2.df <- left_join(racial.killings.df, census2.df) %>%
                        mutate(white_killed = White * (100/ share_White),
                               black_killed = Black * (100/ share_black),
                               hispanic_killed = `Hispanic/Latino` * (100/share_hispanic),
                               asian_killed = `Asian/Pacific Islander` * (100/share_asian),
                               native_killed = `Native American` * (100/share_native),
                               unknown_killed = Unknown * (100/share_unknown)) %>%
                          dplyr::select(code, white_killed, black_killed, hispanic_killed, asian_killed, native_killed, unknown_killed) %>%
                            gather(key = 'race_killed', value = 'count', -code) 
## Joining, by = "code"
# str(racial.killings2.df)

#racial.killings2.df$count[is.nan(racial.killings2.df$count)] <- 0
#racial.killings2.df$count[!is.finite(racial.killings2.df$count)] <- 0


temp <-  racial.killings2.df$count
racial.killings2.df$count2 <- ifelse(is.finite(temp), temp, 0)

# print(racial.killings2.df$count2)

total <- sum(racial.killings2.df$count2)
print(total)
## [1] 9393.611
racial.killings2.df$total <- total

# str(racial.killings2.df)
    
racial.summary.df <- racial.killings2.df %>%
  mutate(race_killed = factor(race_killed, levels = c('white_killed', 'black_killed', 'hispanic_killed', 'asian_killed', 'native_killed', 'unknown_killed'), labels = c('White', 'Black', 'Hispanic/Latino', 'Asian/Pacific Islander', 'Native American', 'Unknown'))) %>%
  group_by(race_killed) %>%
    summarise(frac = sum(count2)/mean(total)) 

actuals <- c(0.625, 0.126, 0.149, 0.05, 0.009, 0.041)

racial.summary.df$actuals <- actuals 

print(racial.summary.df)
## # A tibble: 6 x 3
##   race_killed              frac actuals
##   <fct>                   <dbl>   <dbl>
## 1 White                  0.133    0.625
## 2 Black                  0.588    0.126
## 3 Hispanic/Latino        0.0884   0.149
## 4 Asian/Pacific Islander 0.0375   0.05 
## 5 Native American        0.0584   0.009
## 6 Unknown                0.0945   0.041
racial.summary.df %>% 
  gather(key = 'key', value = 'perc', -race_killed) %>%
      ggplot(aes(x = race_killed, y = perc * 100, fill = key)) +
              geom_bar(stat = 'identity', position = 'dodge', alpha = 0.8) +
                xlab('Race Ethnicity') +
                  ylab('Percentage') +
                    ggtitle('Police Killings and Actual Population Proportion by Race') +
                    labs(subtitle = 'Police Killings Adjusted by Racial Distribution of Tracts') +
                      theme_bw() +
                      theme(text = element_text(size = 16),
                          axis.text = element_text(size = 14),
                          axis.text.x = element_text(angle = 30, hjust = 1)) +
                        scale_fill_viridis_d(name = '', labels = c('Proportion in Population', 'Adjusted Proportion Killed'))

racial.summary2.df <- racial.killings.df %>%
                          dplyr::select (White, Black, `Hispanic/Latino`, `Asian/Pacific Islander`, `Native American`, `Unknown`) %>%
                            gather(White, Black, `Hispanic/Latino`, `Asian/Pacific Islander`, `Native American`, `Unknown`, key = 'race', value = 'value') %>%
                                group_by(race) %>%
                                  summarise(count = sum(value)) %>%
                                    mutate(perc = count/sum(count)) %>%
                                      dplyr::select(race, perc) %>%
                                        mutate(race = factor(race, levels = c('White', 'Black', 'Hispanic/Latino', 'Asian/Pacific Islander', 'Native American', 'Unknown'))) %>%
                                            arrange(race)

str(racial.summary2.df)
## Classes 'tbl_df', 'tbl' and 'data.frame':    6 obs. of  2 variables:
##  $ race: Factor w/ 6 levels "White","Black",..: 1 2 3 4 5 6
##  $ perc: num  0.5138 0.2686 0.1701 0.0225 0.0112 ...
actuals <- c(0.625, 0.126, 0.149, 0.05, 0.009, 0.041)

racial.summary2.df$actuals <- actuals 

print(racial.summary2.df)
## # A tibble: 6 x 3
##   race                     perc actuals
##   <fct>                   <dbl>   <dbl>
## 1 White                  0.514    0.625
## 2 Black                  0.269    0.126
## 3 Hispanic/Latino        0.170    0.149
## 4 Asian/Pacific Islander 0.0225   0.05 
## 5 Native American        0.0112   0.009
## 6 Unknown                0.0138   0.041
racial.summary2.df %>% 
  gather(key = 'key', value = 'perc', -race) %>%
      ggplot(aes(x = race, y = perc * 100, fill = key)) +
              geom_bar(stat = 'identity', position = 'dodge', alpha = 0.8) +
                xlab('Race Ethnicity') +
                  ylab('Percentage') +
                    ggtitle('Police Killings and Actual Population Proportion by Race') +
                    #labs(subtitle = 'Police Killings Adjusted by Racial Distribution of Tracts') +
                      theme_bw() +
                      theme(text = element_text(size = 16),
                          axis.text = element_text(size = 14),
                          axis.text.x = element_text(angle = 30, hjust = 1)) +
                        scale_fill_viridis_d(name = '', labels = c('Proportion in Population', 'Proportion Killed'))

Geographical Analysis

state.df <- read.csv('state_data.csv') %>%
              dplyr::select(State, percent, Region)

state.crime.df <- read.csv('crime_data_state.csv') %>%
                    mutate_all(funs(gsub(",", "", .)), c('Population', 'Violent.crime')) %>%
                      mutate(State = factor(State),
                             Population = as.numeric(Population),
                             Violent.crime = as.numeric(Violent.crime))


state.combined.df <- left_join(state.df, state.crime.df, by = 'State')

#str(state.combined.df)

state.killings.df <- police.killings.df %>% 
                      group_by(state) %>%
                        mutate(tot_killings = n()) %>%
                      ungroup() %>%
                          dplyr::select(state, tot_killings) %>%
                            distinct()
                          
#str(state.killings.df)

state.combined.df <- left_join(state.combined.df, state.killings.df, by = c('State' = 'state')) 

str(state.combined.df)
## 'data.frame':    51 obs. of  6 variables:
##  $ State        : Factor w/ 51 levels "AK","AL","AR",..: 2 1 4 3 5 6 7 9 8 10 ...
##  $ percent      : num  1.548 0.23 2.07 0.944 12.066 ...
##  $ Region       : Factor w/ 4 levels "Midwest","Northeast",..: 3 4 4 3 4 4 2 3 3 3 ...
##  $ Population   : num  4858979 738432 6828065 2978204 39144818 ...
##  $ Violent.crime: num  22952 5392 28012 15526 166883 ...
##  $ tot_killings : int  19 5 44 5 207 30 4 4 7 68 ...
killings.crimes.py <- state.combined.df %>% 
  plot_ly(x = ~tot_killings, 
          y = ~Population, 
          z = ~Violent.crime,
          color = ~Region,
          size = ~Population, 
          marker = list(symbol = 'circle', 
                        sizemode = 'diameter'), 
          sizes = c(5, 50), 
          text = ~paste('<br>State:', State,'<br># Violent Crime:', Violent.crime, '<br>Total Killings:', tot_killings,
                           '<br>Population:', Population),
          hoverinfo = 'text',
          type = 'scatter3d', 
          mode = 'markers') %>%

    layout(title = 'No of Violent Crime vs Total Killings vs Population for each State',
           scene = list(xaxis = list(title = 'Violent Crime'),
                        yaxis = list(title = 'Population'),
                        zaxis = list(title = 'Total Police Killings')))

saveWidget(killings.crimes.py, "killings.crimes.py.html")

killings.crimes.py
state.combined.df %>%
  gather(key = 'number_of', value = 'value', -State, -percent, -Region, -Population) %>%
    ggplot(aes(x = Population, y = value, color = number_of, group = number_of)) +
      geom_point(size = 1.5, alpha = 0.5) +
      geom_smooth(method = 'lm', se = FALSE, alpha = 0.5) +
        xlab('Population') +
        ggtitle('Police Killings and Violent Crimes vs Population') +
          scale_color_viridis_d(name = '', labels = c('Total Police Killings', 'No of Violent Crimes')) +
            theme_bw() +
              theme(text = element_text(size = 16),
                          axis.text = element_text(size = 14),
                          axis.text.x = element_text(angle = 30, hjust = 1)) +
                    scale_y_log10(labels = scales::comma) +
                      scale_x_continuous(labels = scales::comma)

state.combined.df %>%
  gather(key = 'number_of', value = 'value', -State, -percent, -Region, -Population) %>%
  mutate(percapita = (value * 1000)/Population) %>%
    ggplot(aes(x = Population, y = percapita, color = number_of, group = number_of)) +
      #geom_line(size = 1, alpha = 0.6) +
      geom_point(color = 'White', size = 0.0001) +
      geom_smooth(method = 'lm', se = FALSE) +
        xlab('Population') +
        ylab('Killings Per 1000') +
        ggtitle('Police Killings and Violent Crimes vs Population') +
          scale_color_viridis_d(name = 'Per 1000 people', labels = c('Total Police Killings', 'No of Violent Crimes')) +
            theme_bw() +
              theme(text = element_text(size = 16),
                          axis.text = element_text(size = 14),
                          axis.text.x = element_text(angle = 30, hjust = 1)) +
                  scale_y_log10()

## Geospatial Analysis

#tracts <- tracts(state = 'TX', cb = TRUE)
#print(tracts)

pal <- colorNumeric(palette = "YlGnBu", domain = police.killings.df$avg_house_value)

pal2 <- colorFactor(viridis(8), police.killings.df$armed)


us.killings.lt <- leaflet() %>%
  addTiles() %>%
    setView(-100.0, 40.0, zoom = 4) %>% 
    addCircleMarkers(data  = police.killings.df, lng = ~longitude, lat = ~latitude, 
                     radius = 2, clusterOptions = markerClusterOptions())

# us.killings.lt

saveWidget(us.killings.lt, "us.killings.lt.html", selfcontained = FALSE)

#us.killings.lt
ny.killings.df <- police.killings2.df %>%
                    filter(state == 'NY')

tracts <- tracts(state = 'NY', cb = TRUE)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |======                                                           |   9%
  |                                                                       
  |======                                                           |  10%
  |                                                                       
  |=======                                                          |  11%
  |                                                                       
  |========                                                         |  12%
  |                                                                       
  |=========                                                        |  13%
  |                                                                       
  |=========                                                        |  14%
  |                                                                       
  |==========                                                       |  15%
  |                                                                       
  |==========                                                       |  16%
  |                                                                       
  |===========                                                      |  16%
  |                                                                       
  |===========                                                      |  17%
  |                                                                       
  |============                                                     |  18%
  |                                                                       
  |=============                                                    |  20%
  |                                                                       
  |=============                                                    |  21%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |===============                                                  |  22%
  |                                                                       
  |===============                                                  |  23%
  |                                                                       
  |================                                                 |  24%
  |                                                                       
  |================                                                 |  25%
  |                                                                       
  |=================                                                |  26%
  |                                                                       
  |==================                                               |  27%
  |                                                                       
  |==================                                               |  28%
  |                                                                       
  |===================                                              |  29%
  |                                                                       
  |===================                                              |  30%
  |                                                                       
  |====================                                             |  31%
  |                                                                       
  |=====================                                            |  32%
  |                                                                       
  |=====================                                            |  33%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |======================                                           |  35%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |==========================                                       |  41%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |============================                                     |  43%
  |                                                                       
  |============================                                     |  44%
  |                                                                       
  |=============================                                    |  45%
  |                                                                       
  |==============================                                   |  46%
  |                                                                       
  |===============================                                  |  47%
  |                                                                       
  |===============================                                  |  48%
  |                                                                       
  |================================                                 |  49%
  |                                                                       
  |=================================                                |  50%
  |                                                                       
  |=================================                                |  51%
  |                                                                       
  |==================================                               |  52%
  |                                                                       
  |==================================                               |  53%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |====================================                             |  56%
  |                                                                       
  |=====================================                            |  57%
  |                                                                       
  |======================================                           |  58%
  |                                                                       
  |======================================                           |  59%
  |                                                                       
  |=======================================                          |  59%
  |                                                                       
  |========================================                         |  61%
  |                                                                       
  |========================================                         |  62%
  |                                                                       
  |=========================================                        |  63%
  |                                                                       
  |=========================================                        |  64%
  |                                                                       
  |==========================================                       |  64%
  |                                                                       
  |==========================================                       |  65%
  |                                                                       
  |===========================================                      |  66%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |================================================                 |  74%
  |                                                                       
  |=================================================                |  76%
  |                                                                       
  |==================================================               |  77%
  |                                                                       
  |==================================================               |  78%
  |                                                                       
  |===================================================              |  79%
  |                                                                       
  |====================================================             |  80%
  |                                                                       
  |=====================================================            |  81%
  |                                                                       
  |=====================================================            |  82%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |==========================================================       |  89%
  |                                                                       
  |===========================================================      |  90%
  |                                                                       
  |===========================================================      |  91%
  |                                                                       
  |============================================================     |  92%
  |                                                                       
  |=============================================================    |  93%
  |                                                                       
  |=============================================================    |  94%
  |                                                                       
  |==============================================================   |  95%
  |                                                                       
  |==============================================================   |  96%
  |                                                                       
  |===============================================================  |  97%
  |                                                                       
  |================================================================ |  98%
  |                                                                       
  |================================================================ |  99%
  |                                                                       
  |=================================================================| 100%
ny.killings2.df <- geo_join(tracts, ny.killings.df, 'GEOID', 'code')
ny.killings2.df <- geo_join(ny.killings2.df, census2.df, 'GEOID', 'code')

pal <- colorNumeric(palette = "YlGnBu", domain = ny.killings2.df$share_black.1)

ny.killings.lt <- ny.killings2.df %>%
  leaflet() %>%
  setView(-74.0, 40.8, zoom = 10) %>%
  addProviderTiles('CartoDB.Positron') %>% 
  addPolygons(fillColor = ~pal(share_black.1), 
              color = "#b2aeae",
              fillOpacity = 0.5, 
              weight = 1, 
              smoothFactor = 0.2) %>%
  addLegend(pal = pal, 
            values = ~share_black.1, 
            position = 'topleft', 
            title = 'Percent of Black Population',
            labFormat = labelFormat(suffix = '%'))  %>%
  addCircleMarkers(data  = ny.killings2.df, lng = ~longitude, lat = ~latitude, 
                     radius = 4, color = 'red') 

saveWidget(ny.killings.lt, "ny.killings.lt.html", selfcontained = FALSE)
# ny.killings.lt
city.data.df <- read.csv('city_2015.csv', header = TRUE)

police.killings.city.df <- police.killings.df %>%
                              group_by(city) %>%
                                mutate(killings = n()) %>%
                                  dplyr::select(city, killings) %>%
                                    left_join(city.data.df, by = c('city'= 'department_name')) %>%
                                      dplyr::select(city, killings, total_crime)
## Warning: Column `city`/`department_name` joining factors with different
## levels, coercing to character vector
police.killings.city.df <- na.omit(police.killings.city.df)

str(police.killings.city.df)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  301 obs. of  3 variables:
##  $ city       : chr  "Wichita" "San Francisco" "Los Angeles" "Pittsburgh" ...
##  $ killings   : int  2 9 19 4 2 3 5 10 3 4 ...
##  $ total_crime: int  7678 13420 50312 4334 3284 5030 10812 22248 7256 3320 ...
##  - attr(*, "groups")=Classes 'tbl_df', 'tbl' and 'data.frame':   57 obs. of  2 variables:
##   ..$ city : chr  "Albuquerque" "Arlington" "Atlanta" "Aurora" ...
##   ..$ .rows:List of 57
##   .. ..$ : int  7 90 164 247 286
##   .. ..$ : int  55 101 172 257 279
##   .. ..$ : int  48 63 87 267
##   .. ..$ : int  10 47 266 277
##   .. ..$ : int  15 30 105 134 135 147 265 301
##   .. ..$ : int  16 99 250
##   .. ..$ : int 123
##   .. ..$ : int  36 289
##   .. ..$ : int  75 127 143 194 207 214 220 290 291 298
##   .. ..$ : int  116 122 156 196
##   .. ..$ : int  51 60 161 224 274
##   .. ..$ : int  20 61 100 149 222 227 237 264
##   .. ..$ : int  32 113 115 140 167 189 211 261 284
##   .. ..$ : int  17 148 260 262 270 278 285
##   .. ..$ : int  83 104 203 288
##   .. ..$ : int  6 86 103
##   .. ..$ : int  14 92 98 163
##   .. ..$ : int  170 204 218 228 296
##   .. ..$ : int  58 94
##   .. ..$ : int  12 22 23 37 72 74 109 124 165 181 ...
##   .. ..$ : int  8 39 68 71 126 174 206 217 273 276
##   .. ..$ : int  93 131 184 236 295
##   .. ..$ : int  35 56 107 146 151 205
##   .. ..$ : int  42 145 160 171 185 221 243 263 281 282 ...
##   .. ..$ : int  80 173 199 283
##   .. ..$ : int  3 13 44 53 65 69 78 91 139 144 ...
##   .. ..$ : int  120 169 230
##   .. ..$ : int  27 33 52 155
##   .. ..$ : int  85 130 137 269
##   .. ..$ : int  73 82 118 138 187 200 209 213 255
##   .. ..$ : int 46
##   .. ..$ : int  11 254
##   .. ..$ : int  89 108
##   .. ..$ : int  84 159 280
##   .. ..$ : int  79 81 95 106 114 117 121 179
##   .. ..$ : int 197
##   .. ..$ : int  24 112 168 177 188 252
##   .. ..$ : int  29 41 45 50 133 136 158
##   .. ..$ : int  21 40 102 176
##   .. ..$ : int  28 31 153 246 256
##   .. ..$ : int  110 157 234 253
##   .. ..$ : int  19 25 26 49 67 76 141 154 186 292
##   .. ..$ : int  4 125 251 299
##   .. ..$ : int  62 129 242
##   .. ..$ : int 96
##   .. ..$ : int  5 212
##   .. ..$ : int  57 70 77 119 128 192 193 271
##   .. ..$ : int  88 97 132 191 216 231 235 241
##   .. ..$ : int  2 18 43 59 111 162 223 249 268
##   .. ..$ : int  34 64 175 180 182 198 248
##   .. ..$ : int  152 272 294
##   .. ..$ : int  142 232
##   .. ..$ : int  166 195 275
##   .. ..$ : int  9 38 66
##   .. ..$ : int  201 202
##   .. ..$ : int  54 233 239 244 259 287 300
##   .. ..$ : int  1 183
##   ..- attr(*, ".drop")= logi TRUE
##  - attr(*, "na.action")= 'omit' Named int  1 2 3 5 7 8 9 11 12 14 ...
##   ..- attr(*, "names")= chr  "1" "2" "3" "5" ...
police.killings.city.df %>% 
  filter(total_crime > 0) %>%
  ggplot(aes(x = total_crime, y = killings)) +
    geom_point(size = 1, alpha = 0.6) +
      geom_smooth(method = 'lm', se = FALSE, size = 1.5) +
      xlab('Total Crime') +
        ylab('Police Killings') +
          ggtitle('Total Crime vs Police Killings') +
            theme_bw() +
              theme(text = element_text(size = 16),
                          axis.text = element_text(size = 14),
                          axis.text.x = element_text(angle = 30, hjust = 1)) +
                scale_x_log10(labels = scales::comma)

kill = read.csv("police_killings_2015.csv")

kills_per_city = plyr::count(kill, "city")
str(kills_per_city)
## 'data.frame':    752 obs. of  2 variables:
##  $ city: Factor w/ 752 levels "Aguanga","Aiken",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ freq: int  1 1 3 1 1 5 1 1 2 1 ...
kills_per_city = kills_per_city %>% arrange(desc(freq))
setnames(kills_per_city, "freq", "kills")
crime = read.csv("ucr_crime_1975_2015.csv", stringsAsFactors = FALSE)
crime_per_city = filter(crime, year == 2015)

crime_per_city = within(crime_per_city, rm(ORI, months_reported, source, url, violent_per_100k, homs_per_100k, rape_per_100k, rob_per_100k, agg_ass_per_100k))

crime_per_city$total_crime = crime_per_city$homs_sum + crime_per_city$rape_sum + crime_per_city$rob_sum + crime_per_city$agg_ass_sum + crime_per_city$violent_crime

crime_per_city = crime_per_city %>% arrange(desc(total_crime))
crime_per_city$department_name[1] = "New York"
crime_per_city$department_name[41] = "Austin"
crime_per_city$department_name[30] = "Columbus"
setnames(crime_per_city, "department_name", "city")
top_kills_15 = kills_per_city[1:16,]
top_kills_15 <- subset(top_kills_15, city!="Bakersfield")

kills_and_crime <- merge(top_kills_15,crime_per_city,by="city")

kills_and_crime = within(kills_and_crime, rm(year, total_pop, homs_sum, rape_sum, rob_sum, agg_ass_sum, violent_crime))
kills_and_crime = kills_and_crime %>% arrange(desc(kills))
kills_and_crime2 <- kills_and_crime %>%
                      mutate(kills = (kills - min(kills))/(range(kills)[2] - range(kills)[1]), 
                             total_crime = (total_crime - min(total_crime))/(range(total_crime)[2] - range(total_crime)[1])) %>%
                        gather(key = 'key', value = 'value', -city) %>%
                          mutate(key = factor(key))

#kills_and_crime2
kills_and_crime2$value[15] = 0.01
kills_and_crime2$value[25] = 0.01
kills_and_crime2 <- transform(kills_and_crime2, value = ifelse(key == "kills", kills_and_crime2$value*-1, value))
#kills_and_crime2
kills_and_crime3 <- kills_and_crime2 %>% mutate (city = factor(city, levels = c("Los Angeles", "Houston", "Las Vegas", "Chicago", "Indianapolis", "Phoenix", "Dallas", "Miami", "San Francisco", "Austin", "Columbus", "New York", "San Antonio", "San Diego", "Denver")))

n1 <- ggplot(kills_and_crime3, aes(x = city, y = value, fill = key)) + 
  geom_bar(data = subset(kills_and_crime3, key == "kills"), stat = "identity") + 
  geom_bar(data = subset(kills_and_crime3, key == "total_crime"), stat = "identity") + 
  scale_y_continuous(breaks = seq(-1, 1, 0.25),
                     labels = paste0(as.character(c(seq(1, 0, -0.25), seq(0.25, 1, 0.25)))))+ 
  coord_flip() + 
  scale_fill_viridis_d(name = '', labels = c('Total Police Kills', 'Total Crimes Commited')) +
  labs(caption = "All the values are normalised", title = ("Top 10 Cities (based on Police Kills)\n")) +
  xlab("City") + ylab("Police Killings vs Crimes Commited") + 
  theme_bw() + theme(text = element_text(size = 16),
                              axis.text = element_text(size = 16))

n1

Models

killings <- read.csv("police_killings_2015.csv")
states <- killings %>% group_by(state) %>% tally() %>% filter(state != "DC")
state.census <- get_estimates(geography = "state", product = "population")
state.vars <- get_estimates(geography = "state", product = "characteristics", breakdown = c("RACE"),          breakdown_labels = TRUE) %>% spread(RACE, value)
state.demo <- get_estimates(geography = "state", product = "characteristics", breakdown = c("RACE"),          breakdown_labels = TRUE) %>% spread(RACE, value) %>% mutate(non.white.perc = 1 - `White alone or in combination`/`All races`, black.perc = `Black alone or in combination`/`All races`) %>% dplyr::select(NAME, non.white.perc, black.perc)
state.demo$state.code =  state.abb[match(state.demo$NAME, state.name)]
state.census$state.code = state.abb[match(state.census$NAME, state.name)]
state.census <- state.census %>% spread(variable, value) %>% dplyr::select(-NAME)
state.killings <- left_join(states, state.census, by=c("state"="state.code")) %>% left_join(state.demo, by=c("state"="state.code")) %>% dplyr::rename(killed = n)
## Warning: Column `state`/`state.code` joining factor and character vector,
## coercing into character vector
## glm(formula = killed ~ 1, family = "poisson", data = state.killings)
##             coef.est coef.se
## (Intercept) 3.13     0.03   
## ---
##   n = 50, k = 1
##   residual deviance = 1302.8, null deviance = 1302.8 (difference = 0.0)
## glm(formula = killed ~ 1, family = "poisson", data = state.killings, 
##     offset = log(POP))
##             coef.est coef.se
## (Intercept) -12.56     0.03 
## ---
##   n = 50, k = 1
##   residual deviance = 239.3, null deviance = 239.3 (difference = 0.0)
race.model <- glm(killed ~ black.perc, family = "quasipoisson", offset = log(POP), data = state.killings)
# display(race.model)
# eth.co <- coefficients(summary(race.model))[1:3, 1:2]
# ethnicity = c("Black", "Non-White")
# estimate = exp(eth.co[2:3, 1])
# lower = exp(eth.co[2:3, 1] - 2 * eth.co[2:3, 2])
# upper = exp(eth.co[2:3, 1] + 2 * eth.co[2:3, 2])
# eth.co.df = data.frame(ethnicity, estimate, lower, upper)
state.df <- expand.grid(POP = seq(1e6, 4e7, 9e4), black.perc = seq(0.05, 0.75, 0.05))
predicted.kill <- predict(race.model, type="response", newdata=state.df)
pop.pred.df <- data.frame(state.df, predicted = as.vector(predicted.kill)) %>% mutate(NAME = "", real = FALSE)
ggplot(pop.pred.df, aes(x = log10(POP), y = predicted, color = black.perc * 100)) + geom_line(alpha=0.6) + ggtitle("Predicted Number of Deadly Police Encounters by State", subtitle = "Quasi-Poisson Model, Offset By Population") + labs(color = "% Black Population", x = "State Population (Log Scale)", y = "Predicted Number of Killings")  + scale_color_viridis_c(option = 'viridis', direction = -1) + theme_bw()

state.pred <- predict(race.model, type="response", newdata = state.killings)
state.diffs <- data.frame(state.killings$NAME, state.killings$killed, predicted = as.vector(state.pred)) %>% mutate(MAD = abs(state.killings.killed - predicted), actual = state.killings.killed, NAME=state.killings.NAME) %>% left_join(state.killings %>% dplyr::select("NAME", "black.perc", "POP"), by=c("NAME"="NAME")) %>% dplyr::select("NAME", "actual", "predicted", "black.perc", "MAD", "POP")
## Warning: Column `NAME` joining factor and character vector, coercing into
## character vector
ggplot(state.diffs %>% filter(actual < 50), aes(x=actual, y=predicted, label = NAME, color = black.perc * 100)) + geom_point() + geom_smooth(method = "lm", se = F) + geom_text(aes(label = ifelse(MAD > 15 | MAD < 1, NAME, "")), alpha = .8) + ggtitle("State Model Accuracy (Actual < 50)", subtitle = "Difference in Model Fit, Outliers and Exact Predictions Labeled") + labs(y="Model Predicted Killings", x="Actual Killings", color = "% Black Population") + scale_color_viridis_c(option = 'viridis', direction = -1) + theme_bw() 

library(broom)
## Warning: package 'broom' was built under R version 3.5.2
residuals <- broom::augment(race.model)
ggplot(residuals, aes(x=.resid, y=.fitted)) + geom_point() + geom_smooth()+ ggtitle("State Model Accuracy (Actual < 50)", subtitle = "Difference in Model Fit, Outliers and Exact Predictions Labeled") + labs(y="Model Predicted Killings", x="Actual Killings", color = "% Black Population") + scale_color_viridis_c(option = 'viridis', direction = -1) + theme_bw() 
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

county.killings <- read.csv("police_killing_combined.csv")
#str(county.killings)
county.counts <- county.killings %>% 
  mutate(fipscode = as.character(code)) %>% 
  mutate(fipscode = str_pad(fipscode, 11, side = "left", pad="0")) %>%
  dplyr::select(fipscode)
county.counts <- county.counts %>% mutate(fips = str_sub(fipscode, 1, 5)) %>% group_by(fips) %>% tally() %>% rename(killed = n)
county.demo <- get_estimates(geography = "county", product = "characteristics", breakdown = c("RACE"),          breakdown_labels = TRUE) %>% spread(RACE, value) %>% mutate(non.white.perc = 1 - `White alone or in combination`/`All races`, black.perc = `Black alone or in combination`/`All races`) %>% dplyr::select(NAME, non.white.perc, black.perc, GEOID)
county.census <- get_estimates(geography = "county", product = "population") %>% spread(variable, value) %>% dplyr::select(-NAME)
county.counts <- left_join(county.counts, county.demo, by=c("fips"="GEOID")) %>% left_join(county.census, by=c("fips"="GEOID"))
county.const.model <- glm(killed ~ 1, family = "poisson",   data = county.counts)
display(county.const.model) #deviance 1104
## glm(formula = killed ~ 1, family = "poisson", data = county.counts)
##             coef.est coef.se
## (Intercept) 0.72     0.03   
## ---
##   n = 541, k = 1
##   residual deviance = 1104.2, null deviance = 1104.2 (difference = 0.0)
county.pop.model <- glm(killed ~ 1, family = "poisson", offset = log(POP), data = county.counts)
display(county.pop.model) #deviance = 877
## glm(formula = killed ~ 1, family = "poisson", data = county.counts, 
##     offset = log(POP))
##             coef.est coef.se
## (Intercept) -12.14     0.03 
## ---
##   n = 541, k = 1
##   residual deviance = 877.2, null deviance = 877.2 (difference = 0.0)
county.race.model <- glm(killed ~  non.white.perc , family = "quasipoisson", offset = log(POP), data = county.counts)
display(county.race.model) #deviance = 786
## glm(formula = killed ~ non.white.perc, family = "quasipoisson", 
##     data = county.counts, offset = log(POP))
##                coef.est coef.se
## (Intercept)    -11.77     0.11 
## non.white.perc  -1.65     0.44 
## ---
##   n = 541, k = 2
##   residual deviance = 828.0, null deviance = 877.2 (difference = 49.2)
##   overdispersion parameter = 3.2
county.grid <- expand.grid(POP = seq(2000, 5e6, 6000), non.white.perc = seq(0.05, 0.6, 0.05))
county.predicted.kill <- predict(county.race.model, type="response", newdata=county.grid)
county.pred.df <- data.frame(county.grid, predicted = as.vector(county.predicted.kill))
ggplot(county.pred.df, aes(x = log10(POP), y = predicted, color = 100 * non.white.perc)) + geom_line(alpha = 0.6) + ggtitle("Predicted Number of Deadly Police Encounters By County", subtitle = "Quasi-Poisson Model, Offset By Population") + labs(color = "% Non-White", x = "County Population (Log 10 Scale)", y = "Predicted Number of Deaths") + scale_color_viridis_c(option = 'blues', direction = -1) + theme_bw()
## Warning in viridisLite::viridis(n, alpha, begin, end, direction, option):
## Option 'blues' does not exist. Defaulting to 'viridis'.

county.pred <- predict(county.race.model, type="response", newdata = county.counts)
county.diffs <- data.frame(county.counts$NAME, county.counts$killed, predicted = as.vector(county.pred)) %>% mutate(MAD = abs(county.counts.killed - predicted), actual = county.counts.killed, NAME=county.counts.NAME) %>% left_join(county.counts %>% dplyr::select("NAME", "non.white.perc", "POP"), by=c("NAME"="NAME")) %>% dplyr::select("NAME", "actual", "predicted", "non.white.perc", "MAD", "POP")
## Warning: Column `NAME` joining factor and character vector, coercing into
## character vector
ggplot(county.diffs %>% filter(actual < 20 & actual > 3), aes(x=actual, y=predicted, label = NAME, color = non.white.perc * 100)) + geom_point() + geom_smooth(method = "lm", se = F) + geom_text(aes(label = ifelse(MAD > 8 , NAME, "")), alpha = .8) + ggtitle("County Model Accuracy (Actual < 20 and > 3)", subtitle = "Difference in Model Fit, Outliers Labeled") + labs(y="Model Predicted Killings", x="Actual Killings", color = "% NonWhite Population") + scale_color_viridis_c(option = 'viridis', direction = -1) + theme_bw()